# encoding:utf-8
import re #
from pathlib import Path
import os  
  
# Matches a "CHI:" speaker tier and the rest of its line, with optional
# surrounding square brackets.
_CHI_UTTERANCE_RE = re.compile(r"\[*CHI:.*\]*")
# Everything outside U+4E00..U+9FA5 is removed from the matched text.
_NON_CHINESE_RE = re.compile(r"[^\u4e00-\u9fa5]")


def _child_age_and_gender(lines):
    """Return ``(age, gender)`` read from the ``@ID`` line mentioning ``CHI|``.

    The age is the ``|``-delimited field immediately after the first
    ``CHI|`` on the line and the gender is the field after that.  Missing
    fields come back as empty strings.  When several lines qualify, the
    last one wins.
    """
    age = gender = ""
    for line in lines:
        if "@ID" not in line:
            continue
        _, marker, rest = line.partition("CHI|")
        if not marker:
            continue
        # Drop the line terminator so it can never leak into a file name.
        fields = rest.rstrip("\r\n").split("|")
        age = fields[0]
        gender = fields[1] if len(fields) > 1 else ""
    return age, gender


def _chinese_utterances(lines):
    """Yield the Chinese-only text of each line that contains a ``CHI:`` tier.

    Lines whose ``CHI:`` tier holds no character in U+4E00..U+9FA5 are
    skipped.
    """
    for line in lines:
        match = _CHI_UTTERANCE_RE.search(line)
        if match is None:
            continue
        # Use the matched text itself; str(match) would give the Match
        # repr, which truncates the matched text to 50 characters.
        chinese = _NON_CHINESE_RE.sub("", match.group(0))
        if chinese:
            yield chinese


def file_tiqu(file_dir):
    """Extract the Chinese text of ``CHI:`` lines from every ``.cha`` file.

    Walks ``file_dir`` recursively.  For each ``<name>.cha`` file found, a
    UTF-8 file named ``<name>%<age>%<gender>%_out.txt`` is written next to
    it, holding one line per ``CHI:`` line of the input, reduced to its
    Chinese characters.  ``<age>`` and ``<gender>`` are taken from the
    ``@ID`` line that contains ``CHI|`` and are empty when the file has no
    such line.

    Args:
        file_dir: Root directory to search.

    Returns:
        None.  The results are the output files written to disk.
    """
    for root, _dirs, files in os.walk(file_dir):
        if not files:
            continue
        # Materialise the listing first: output files are created in the
        # same directory while we iterate.
        for cha_path in sorted(Path(root).glob("*.cha")):
            # Plain 'r' mode: the 'U' flag was removed in Python 3.11 and
            # universal newlines are already the default in text mode.
            with open(cha_path, "r", encoding="UTF-8") as source:
                lines = source.readlines()

            age, gender = _child_age_and_gender(lines)
            stem = cha_path.name.replace(".cha", "")
            out_name = stem + "%" + age + "%" + gender + "%" + "_out.txt"
            # os.path.join keeps the path valid on every platform.
            out_path = os.path.join(root, out_name)

            with open(out_path, "w", encoding="UTF-8") as out:
                out.writelines(
                    text + "\n" for text in _chinese_utterances(lines)
                )
      
# Run the extraction on the corpus directory.  The path is a raw string so
# its backslashes stay literal instead of forming invalid escape sequences.
file_tiqu(r'E:\周丹丽研究生\语言学python\语料')



